<!DOCTYPE html>
<html lang="zh-cn" dir="ltr">
    <head><meta charset='utf-8'>
<meta name='viewport' content='width=device-width, initial-scale=1'><meta name='description' content='基于mini hash的文档去重 实现代码 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 from datasketch import MinHash, MinHashLSH from collections import defaultdict # 结束标点符号 END_PUN = set(&amp;#34;.'><title>Mini hash文档去重</title>

<link rel='canonical' href='https://charent.github.io/p/mini-hash%E6%96%87%E6%A1%A3%E5%8E%BB%E9%87%8D/'>

<link rel="stylesheet" href="/scss/style.min.65463afd28d606277b44441c8bbb8b0277823a2d0c03ab8ba9d0567d274f7b43.css"><meta property='og:title' content='Mini hash文档去重'>
<meta property='og:description' content='基于mini hash的文档去重 实现代码 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 from datasketch import MinHash, MinHashLSH from collections import defaultdict # 结束标点符号 END_PUN = set(&amp;#34;.'>
<meta property='og:url' content='https://charent.github.io/p/mini-hash%E6%96%87%E6%A1%A3%E5%8E%BB%E9%87%8D/'>
<meta property='og:site_name' content='Charent的博客'>
<meta property='og:type' content='article'><meta property='article:section' content='Post' /><meta property='article:published_time' content='2024-01-17T00:00:00&#43;00:00'/><meta property='article:modified_time' content='2024-01-17T00:00:00&#43;00:00'/>
<meta name="twitter:title" content="Mini hash文档去重">
<meta name="twitter:description" content="基于mini hash的文档去重 实现代码 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 from datasketch import MinHash, MinHashLSH from collections import defaultdict # 结束标点符号 END_PUN = set(&amp;#34;.">
    </head>
    <body class="
    article-page
    ">
    <script>
        (function() {
            const colorSchemeKey = 'StackColorScheme';
            if(!localStorage.getItem(colorSchemeKey)){
                localStorage.setItem(colorSchemeKey, "auto");
            }
        })();
    </script><script>
    (function() {
        const colorSchemeKey = 'StackColorScheme';
        const colorSchemeItem = localStorage.getItem(colorSchemeKey);
        const supportDarkMode = window.matchMedia('(prefers-color-scheme: dark)').matches === true;

        if (colorSchemeItem == 'dark' || colorSchemeItem === 'auto' && supportDarkMode) {
            

            document.documentElement.dataset.scheme = 'dark';
        } else {
            document.documentElement.dataset.scheme = 'light';
        }
    })();
</script>
<div class="container main-container flex on-phone--column extended"><aside class="sidebar left-sidebar sticky ">
    <button class="hamburger hamburger--spin" type="button" id="toggle-menu" aria-label="切换菜单">
        <span class="hamburger-box">
            <span class="hamburger-inner"></span>
        </span>
    </button>

    <header>
        
            
            <figure class="site-avatar">
                <a href="/">
                
                    
                    
                    
                        
                        <img src="/img/avatar_hu544a9780599d84daff7ca906f17dd84c_13353_300x0_resize_box_3.png" width="300"
                            height="300" class="site-logo" loading="lazy" alt="Avatar">
                    
                
                </a>
                
                    <span class="emoji">😆</span>
                
            </figure>
            
        
        
        <div class="site-meta">
            <h1 class="site-name"><a href="/">Charent的博客</a></h1>
            <h2 class="site-description">不积硅步，无以至千里</h2>
        </div>
    </header><ol class="social-menu">
            
                <li>
                    <a 
                        href='https://github.com/charent'
                        target="_blank"
                        title="GitHub"
                    >
                        
                        
                            <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-brand-github" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z" fill="none"/>
  <path d="M9 19c-4.3 1.4 -4.3 -2.5 -6 -3m12 5v-3.5c0 -1 .1 -1.4 -.5 -2c2.8 -.3 5.5 -1.4 5.5 -6a4.6 4.6 0 0 0 -1.3 -3.2a4.2 4.2 0 0 0 -.1 -3.2s-1.1 -.3 -3.5 1.3a12.3 12.3 0 0 0 -6.2 0c-2.4 -1.6 -3.5 -1.3 -3.5 -1.3a4.2 4.2 0 0 0 -.1 3.2a4.6 4.6 0 0 0 -1.3 3.2c0 4.6 2.7 5.7 5.5 6c-.6 .6 -.6 1.2 -.5 2v3.5" />
</svg>



                        
                    </a>
                </li>
            
        </ol><ol class="menu" id="main-menu">
        
        
        

        <li >
            <a href='/' >
                
                
                
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-home" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <polyline points="5 12 3 12 12 3 21 12 19 12" />
  <path d="M5 12v7a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-7" />
  <path d="M9 21v-6a2 2 0 0 1 2 -2h2a2 2 0 0 1 2 2v6" />
</svg>



                
                <span>主页</span>
            </a>
        </li>
        
        

        <li >
            <a href='/archives/' >
                
                
                
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-archive" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <rect x="3" y="4" width="18" height="4" rx="2" />
  <path d="M5 8v10a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-10" />
  <line x1="10" y1="12" x2="14" y2="12" />
</svg>



                
                <span>归档</span>
            </a>
        </li>
        
        

        <li >
            <a href='/search/' >
                
                
                
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-search" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <circle cx="10" cy="10" r="7" />
  <line x1="21" y1="21" x2="15" y2="15" />
</svg>



                
                <span>搜索</span>
            </a>
        </li>
        
        

        <li >
            <a href='/%E5%85%B3%E4%BA%8E/' >
                
                
                
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-link" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <path d="M10 14a3.5 3.5 0 0 0 5 0l4 -4a3.5 3.5 0 0 0 -5 -5l-.5 .5" />
  <path d="M14 10a3.5 3.5 0 0 0 -5 0l-4 4a3.5 3.5 0 0 0 5 5l.5 -.5" />
</svg>



                
                <span>关于</span>
            </a>
        </li>
        

        <div class="menu-bottom-section">
                <li id="i18n-switch">  
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-language" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z" fill="none"/>
  <path d="M4 5h7" />
  <path d="M9 3v2c0 4.418 -2.239 8 -5 8" />
  <path d="M5 9c-.003 2.144 2.952 3.908 6.7 4" />
  <path d="M12 20l4 -9l4 9" />
  <path d="M19.1 18h-6.2" />
</svg>



                    <select name="language" onchange="window.location.href = this.selectedOptions[0].value">
                        
                            <option value="https://charent.github.io/" selected>中文</option>
                        
                            <option value="https://charent.github.io/en/" >English</option>
                        
                    </select>
                </li>
            
            
            
                <li id="dark-mode-toggle">
                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-toggle-left" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <circle cx="8" cy="12" r="2" />
  <rect x="2" y="6" width="20" height="12" rx="6" />
</svg>



                    <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-toggle-right" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <circle cx="16" cy="12" r="2" />
  <rect x="2" y="6" width="20" height="12" rx="6" />
</svg>



                    <span>暗色模式</span>
                </li>
            
        </div>
    </ol>
</aside>
<main class="main full-width">
    <article class="main-article">
    <header class="article-header">

    <div class="article-details">
    
    <header class="article-category">
        
            <a href="/categories/%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86/" >
                数据处理
            </a>
        
    </header>
    

    <div class="article-title-wrapper">
        <h2 class="article-title">
            <a href="/p/mini-hash%E6%96%87%E6%A1%A3%E5%8E%BB%E9%87%8D/">Mini hash文档去重</a>
        </h2>
    
        
    </div>

    
    <footer class="article-time">
        
            <div>
                <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-calendar-time" width="56" height="56" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <path d="M11.795 21h-6.795a2 2 0 0 1 -2 -2v-12a2 2 0 0 1 2 -2h12a2 2 0 0 1 2 2v4" />
  <circle cx="18" cy="18" r="4" />
  <path d="M15 3v4" />
  <path d="M7 3v4" />
  <path d="M3 11h16" />
  <path d="M18 16.496v1.504l1 1" />
</svg>
                <time class="article-time--published">2024-01-17</time>
            </div>
        

        
    </footer>
    

    
</div>
</header>

    <section class="article-content">
    
    
    <h1 id="基于mini-hash的文档去重">基于mini hash的文档去重</h1>
<h2 id="实现代码">实现代码</h2>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre tabindex="0" class="chroma"><code><span class="lnt"> 1
</span><span class="lnt"> 2
</span><span class="lnt"> 3
</span><span class="lnt"> 4
</span><span class="lnt"> 5
</span><span class="lnt"> 6
</span><span class="lnt"> 7
</span><span class="lnt"> 8
</span><span class="lnt"> 9
</span><span class="lnt">10
</span><span class="lnt">11
</span><span class="lnt">12
</span><span class="lnt">13
</span><span class="lnt">14
</span><span class="lnt">15
</span><span class="lnt">16
</span><span class="lnt">17
</span><span class="lnt">18
</span><span class="lnt">19
</span><span class="lnt">20
</span><span class="lnt">21
</span><span class="lnt">22
</span><span class="lnt">23
</span><span class="lnt">24
</span><span class="lnt">25
</span><span class="lnt">26
</span><span class="lnt">27
</span><span class="lnt">28
</span><span class="lnt">29
</span><span class="lnt">30
</span><span class="lnt">31
</span><span class="lnt">32
</span><span class="lnt">33
</span><span class="lnt">34
</span><span class="lnt">35
</span><span class="lnt">36
</span><span class="lnt">37
</span><span class="lnt">38
</span><span class="lnt">39
</span><span class="lnt">40
</span><span class="lnt">41
</span><span class="lnt">42
</span><span class="lnt">43
</span><span class="lnt">44
</span><span class="lnt">45
</span><span class="lnt">46
</span><span class="lnt">47
</span><span class="lnt">48
</span><span class="lnt">49
</span><span class="lnt">50
</span><span class="lnt">51
</span><span class="lnt">52
</span><span class="lnt">53
</span><span class="lnt">54
</span><span class="lnt">55
</span><span class="lnt">56
</span><span class="lnt">57
</span><span class="lnt">58
</span><span class="lnt">59
</span><span class="lnt">60
</span><span class="lnt">61
</span><span class="lnt">62
</span><span class="lnt">63
</span></code></pre></td>
<td class="lntd">
<pre tabindex="0" class="chroma"><code class="language-python" data-lang="python"><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="kn">from</span> <span class="nn">datasketch</span> <span class="kn">import</span> <span class="n">MinHash</span><span class="p">,</span> <span class="n">MinHashLSH</span>
</span></span><span class="line"><span class="cl"><span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1"># 结束标点符号</span>
</span></span><span class="line"><span class="cl"><span class="n">END_PUN</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="s2">&#34;.。!！）)》}】?？</span><span class="se">\&#34;</span><span class="s2">”&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1"># 保留中文和英文、下划线，不要标点符号</span>
</span></span><span class="line"><span class="cl"><span class="n">NON_CHAR</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s2">&#34;[^[</span><span class="se">\u4E00</span><span class="s2">-</span><span class="se">\u9FA5</span><span class="s2">|A-Za-z_0-9]&#34;</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="k">def</span> <span class="nf">_get_doc_mini_hash</span><span class="p">(</span><span class="n">doc</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">|</span> <span class="nb">str</span><span class="p">,</span> <span class="n">num_perm</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">MinHash</span><span class="p">:</span>
</span></span><span class="line"><span class="cl">    <span class="s1">&#39;&#39;&#39;
</span></span></span><span class="line"><span class="cl"><span class="s1">    获取一段文本的mini hash
</span></span></span><span class="line"><span class="cl"><span class="s1">    &#39;&#39;&#39;</span>
</span></span><span class="line"><span class="cl">    <span class="n">mini_hash</span> <span class="o">=</span> <span class="n">MinHash</span><span class="p">(</span><span class="n">num_perm</span><span class="o">=</span><span class="n">num_perm</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">    <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">doc</span><span class="p">:</span>
</span></span><span class="line"><span class="cl">        <span class="n">mini_hash</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">s</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">))</span>
</span></span><span class="line"><span class="cl">    <span class="k">return</span> <span class="n">mini_hash</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="k">class</span> <span class="nc">DropDatasetDuplicate</span><span class="p">:</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>  <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span><span class="o">=</span><span class="mf">0.85</span><span class="p">,</span> <span class="n">num_perm</span><span class="p">:</span> <span class="nb">int</span><span class="o">=</span><span class="mi">256</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
</span></span><span class="line"><span class="cl">        <span class="s1">&#39;&#39;&#39;
</span></span></span><span class="line"><span class="cl"><span class="s1">        获取一个数据集中所有重复（相似的超过threshold）的index，输入为：list[str]，一个str元素为一段文本(doc)
</span></span></span><span class="line"><span class="cl"><span class="s1">        如输入： [a, b, c, d, c, d, e] 返回：{4, 5} (后面两个 c, d 的index)
</span></span></span><span class="line"><span class="cl"><span class="s1">        &#39;&#39;&#39;</span>
</span></span><span class="line"><span class="cl">        <span class="bp">self</span><span class="o">.</span><span class="n">similar_index_cluster</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">set</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">        <span class="bp">self</span><span class="o">.</span><span class="n">data_lsh</span> <span class="o">=</span> <span class="n">MinHashLSH</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="n">threshold</span><span class="p">,</span> <span class="n">num_perm</span><span class="o">=</span><span class="n">num_perm</span><span class="p">)</span> 
</span></span><span class="line"><span class="cl">        <span class="bp">self</span><span class="o">.</span><span class="n">num_perm</span> <span class="o">=</span> <span class="n">num_perm</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">    <span class="k">def</span> <span class="nf">add_doc</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="nb">object</span><span class="p">,</span> <span class="n">doc</span><span class="p">:</span> <span class="nb">str</span><span class="p">,)</span> <span class="o">-&gt;</span> <span class="nb">set</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
</span></span><span class="line"><span class="cl">        <span class="s1">&#39;&#39;&#39;
</span></span></span><span class="line"><span class="cl"><span class="s1">        添加文档，
</span></span></span><span class="line"><span class="cl"><span class="s1">        index： 文档的索引
</span></span></span><span class="line"><span class="cl"><span class="s1">        doc: 文档本身
</span></span></span><span class="line"><span class="cl"><span class="s1">        &#39;&#39;&#39;</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">        <span class="c1"># 只保留中文和英文、下划线，不要标点符号</span>
</span></span><span class="line"><span class="cl">        <span class="n">doc</span> <span class="o">=</span> <span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">NON_CHAR</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">doc</span><span class="p">))</span>
</span></span><span class="line"><span class="cl">        <span class="c1"># doc = [&#39;&#39;.join(t) for t in list(ngrams(doc, 3))]</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">        <span class="n">doc_hash</span> <span class="o">=</span> <span class="n">_get_doc_mini_hash</span><span class="p">(</span><span class="n">doc</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">num_perm</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">        <span class="n">close_duplicates</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">data_lsh</span><span class="o">.</span><span class="n">query</span><span class="p">(</span><span class="n">doc_hash</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">        <span class="bp">self</span><span class="o">.</span><span class="n">data_lsh</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">doc_hash</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">        <span class="c1"># 所有相似的doc在similar_index_cluster中的key都是最早出现的idx</span>
</span></span><span class="line"><span class="cl">        <span class="c1"># 如：data中索引inndex 2, 7, 8, 9, 10, 12 是相似的，则在similar_index_cluster中表现为 {2: {8, 9, 10, 12}}</span>
</span></span><span class="line"><span class="cl">        <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">close_duplicates</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
</span></span><span class="line"><span class="cl">            <span class="n">min_idx</span><span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">close_duplicates</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">            <span class="bp">self</span><span class="o">.</span><span class="n">similar_index_cluster</span><span class="p">[</span><span class="n">min_idx</span><span class="p">]</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">    
</span></span><span class="line"><span class="cl">    <span class="k">def</span> <span class="nf">get_duplicate_indexs</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
</span></span><span class="line"><span class="cl">        <span class="s1">&#39;&#39;&#39;
</span></span></span><span class="line"><span class="cl"><span class="s1">        返回所有的重复文档索引
</span></span></span><span class="line"><span class="cl"><span class="s1">        &#39;&#39;&#39;</span>
</span></span><span class="line"><span class="cl">        <span class="n">similar_index_cluster</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">similar_index_cluster</span>
</span></span><span class="line"><span class="cl">        <span class="n">need_to_remove_idx</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
</span></span><span class="line"><span class="cl">        
</span></span><span class="line"><span class="cl">        <span class="k">for</span> <span class="n">key_idx</span> <span class="ow">in</span> <span class="n">similar_index_cluster</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span>
</span></span><span class="line"><span class="cl">            <span class="n">need_to_remove_idx</span> <span class="o">|=</span> <span class="n">similar_index_cluster</span><span class="p">[</span><span class="n">key_idx</span><span class="p">]</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">        <span class="k">return</span> <span class="n">need_to_remove_idx</span>
</span></span></code></pre></td></tr></table>
</div>
</div><h2 id="使用方法">使用方法</h2>
<p>单进程速度非常慢，需要多进程处理（待补充）</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre tabindex="0" class="chroma"><code><span class="lnt"> 1
</span><span class="lnt"> 2
</span><span class="lnt"> 3
</span><span class="lnt"> 4
</span><span class="lnt"> 5
</span><span class="lnt"> 6
</span><span class="lnt"> 7
</span><span class="lnt"> 8
</span><span class="lnt"> 9
</span><span class="lnt">10
</span></code></pre></td>
<td class="lntd">
<pre tabindex="0" class="chroma"><code class="language-python" data-lang="python"><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="c1"># 先顺序遍历获取哪些行是重复的</span>
</span></span><span class="line"><span class="cl"><span class="k">for</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">response</span> <span class="ow">in</span> <span class="n">progress</span><span class="o">.</span><span class="n">track</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">parquet_table</span><span class="p">[</span><span class="s1">&#39;prompt&#39;</span><span class="p">],</span> <span class="n">parquet_table</span><span class="p">[</span><span class="s1">&#39;response&#39;</span><span class="p">]),</span> <span class="n">total</span><span class="o">=</span><span class="n">parquet_table</span><span class="o">.</span><span class="n">num_rows</span><span class="p">):</span>
</span></span><span class="line"><span class="cl">    <span class="n">row_index</span> <span class="o">+=</span> <span class="mi">1</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl">    <span class="n">doc</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&#34;</span><span class="si">{</span><span class="n">prompt</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span><span class="si">}{</span><span class="n">response</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span><span class="si">}</span><span class="s2">&#34;</span>
</span></span><span class="line"><span class="cl">    <span class="n">drop_dataset_duplicate</span><span class="o">.</span><span class="n">add_doc</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">row_index</span><span class="p">,</span> <span class="n">doc</span><span class="o">=</span><span class="n">doc</span><span class="p">)</span>
</span></span><span class="line"><span class="cl">
</span></span><span class="line"><span class="cl"><span class="n">row_index</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span>
</span></span><span class="line"><span class="cl"><span class="n">need_to_drop_indexs</span> <span class="o">=</span> <span class="n">drop_dataset_duplicate</span><span class="o">.</span><span class="n">get_duplicate_indexs</span><span class="p">()</span>
</span></span></code></pre></td></tr></table>
</div>
</div>
</section>


    <footer class="article-footer">
    

    </footer>


    
        <link 
                rel="stylesheet" 
                href="https://cdn.jsdelivr.net/npm/katex@0.13.13/dist/katex.min.css"integrity="sha384-RZU/ijkSsFbcmivfdRBQDtwuwVqK7GMOw6IMvKyeWL2K5UAlyp6WonmB8m7Jd0Hn"crossorigin="anonymous"
            ><script 
                src="https://cdn.jsdelivr.net/npm/katex@0.13.13/dist/katex.min.js"integrity="sha384-pK1WpvzWVBQiP0/GjnvRxV4mOb0oxFuyRxJlk6vVw146n3egcN5C925NCP7a7BY8"crossorigin="anonymous"
                defer
                >
            </script><script 
                src="https://cdn.jsdelivr.net/npm/katex@0.13.13/dist/contrib/auto-render.min.js"integrity="sha384-vZTG03m&#43;2yp6N6BNi5iM4rW4oIwk5DfcNdFfxkk9ZWpDriOkXX8voJBFrAO7MpVl"crossorigin="anonymous"
                defer
                >
            </script><script>
    window.addEventListener("DOMContentLoaded", () => {
        renderMathInElement(document.querySelector(`.article-content`), {
            delimiters: [
                { left: "$$", right: "$$", display: true },
                { left: "$", right: "$", display: false },
                { left: "\\(", right: "\\)", display: false },
                { left: "\\[", right: "\\]", display: true }
            ]
        });})
</script>
    
</article>

    

    

     
    
        
    

    <footer class="site-footer">
    <section class="copyright">
        &copy; 
        
            2018 - 
        
        2024 Charent的博客
    </section>
    
    <section class="powerby">
         <br />
        
    </section>
</footer>


    
<div class="pswp" tabindex="-1" role="dialog" aria-hidden="true">

    
    <div class="pswp__bg"></div>

    
    <div class="pswp__scroll-wrap">

        
        <div class="pswp__container">
            <div class="pswp__item"></div>
            <div class="pswp__item"></div>
            <div class="pswp__item"></div>
        </div>

        
        <div class="pswp__ui pswp__ui--hidden">

            <div class="pswp__top-bar">

                

                <div class="pswp__counter"></div>

                <button class="pswp__button pswp__button--close" title="Close (Esc)"></button>

                <button class="pswp__button pswp__button--share" title="Share"></button>

                <button class="pswp__button pswp__button--fs" title="Toggle fullscreen"></button>

                <button class="pswp__button pswp__button--zoom" title="Zoom in/out"></button>

                
                
                <div class="pswp__preloader">
                    <div class="pswp__preloader__icn">
                        <div class="pswp__preloader__cut">
                            <div class="pswp__preloader__donut"></div>
                        </div>
                    </div>
                </div>
            </div>

            <div class="pswp__share-modal pswp__share-modal--hidden pswp__single-tap">
                <div class="pswp__share-tooltip"></div>
            </div>

            <button class="pswp__button pswp__button--arrow--left" title="Previous (arrow left)">
            </button>

            <button class="pswp__button pswp__button--arrow--right" title="Next (arrow right)">
            </button>

            <div class="pswp__caption">
                <div class="pswp__caption__center"></div>
            </div>

        </div>

    </div>

</div><script 
                src="https://cdn.jsdelivr.net/npm/photoswipe@4.1.3/dist/photoswipe.min.js"integrity="sha256-ePwmChbbvXbsO02lbM3HoHbSHTHFAeChekF1xKJdleo="crossorigin="anonymous"
                defer
                >
            </script><script 
                src="https://cdn.jsdelivr.net/npm/photoswipe@4.1.3/dist/photoswipe-ui-default.min.js"integrity="sha256-UKkzOn/w1mBxRmLLGrSeyB4e1xbrp4xylgAWb3M42pU="crossorigin="anonymous"
                defer
                >
            </script><link 
                rel="stylesheet" 
                href="https://cdn.jsdelivr.net/npm/photoswipe@4.1.3/dist/default-skin/default-skin.css"integrity="sha256-c0uckgykQ9v5k&#43;IqViZOZKc47Jn7KQil4/MP3ySA3F8="crossorigin="anonymous"
            ><link 
                rel="stylesheet" 
                href="https://cdn.jsdelivr.net/npm/photoswipe@4.1.3/dist/photoswipe.css"integrity="sha256-SBLU4vv6CA6lHsZ1XyTdhyjJxCjPif/TRkjnsyGAGnE="crossorigin="anonymous"
            >

            </main>
    <aside class="sidebar right-sidebar sticky">
        
            
                
    <section class="widget archives">
        <div class="widget-icon">
            <svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-hash" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
  <path stroke="none" d="M0 0h24v24H0z"/>
  <line x1="5" y1="9" x2="19" y2="9" />
  <line x1="5" y1="15" x2="19" y2="15" />
  <line x1="11" y1="4" x2="7" y2="20" />
  <line x1="17" y1="4" x2="13" y2="20" />
</svg>



        </div>
        <h2 class="widget-title section-title">目录</h2>
        
        <div class="widget--toc">
            <nav id="TableOfContents">
  <ol>
    <li><a href="#实现代码">实现代码</a></li>
    <li><a href="#使用方法">使用方法</a></li>
  </ol>
</nav>
        </div>
    </section>

            
        
    </aside>


        </div>
        <script 
                src="https://cdn.jsdelivr.net/npm/node-vibrant@3.1.5/dist/vibrant.min.js"integrity="sha256-5NovOZc4iwiAWTYIFiIM7DxKUXKWvpVEuMEPLzcm5/g="crossorigin="anonymous"
                
                >
            </script><script type="text/javascript" src="/ts/main.js" defer></script>
<script>
    (function () {
        const customFont = document.createElement('link');
        customFont.href = "https://fonts.googleapis.com/css2?family=Lato:wght@300;400;700&display=swap";

        customFont.type = "text/css";
        customFont.rel = "stylesheet";

        document.head.appendChild(customFont);
    }());
</script>

    </body>
</html>
